import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler , LabelEncoder
from scipy import stats
import warnings
import keras
import random
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import (roc_auc_score, roc_curve)
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, \
recall_score, confusion_matrix, classification_report, \
accuracy_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay,classification_report,plot_roc_curve,accuracy_score
df_bank = pd.read_csv("F:/Bootcamp Data Science/Final Project/Banking Dataset - Marketing Targets Maybe using this dataset/bank.csv")
df_bank.rename(columns={'y': 'subscribed'} , inplace = True)
df_bank
| age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | subscribed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 58 | management | married | tertiary | no | 2143 | yes | no | unknown | 5 | may | 261 | 1 | -1 | 0 | unknown | no |
| 1 | 44 | technician | single | secondary | no | 29 | yes | no | unknown | 5 | may | 151 | 1 | -1 | 0 | unknown | no |
| 2 | 33 | entrepreneur | married | secondary | no | 2 | yes | yes | unknown | 5 | may | 76 | 1 | -1 | 0 | unknown | no |
| 3 | 47 | blue-collar | married | unknown | no | 1506 | yes | no | unknown | 5 | may | 92 | 1 | -1 | 0 | unknown | no |
| 4 | 33 | unknown | single | unknown | no | 1 | no | no | unknown | 5 | may | 198 | 1 | -1 | 0 | unknown | no |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 49727 | 33 | services | married | secondary | no | -333 | yes | no | cellular | 30 | jul | 329 | 5 | -1 | 0 | unknown | no |
| 49728 | 57 | self-employed | married | tertiary | yes | -3313 | yes | yes | unknown | 9 | may | 153 | 1 | -1 | 0 | unknown | no |
| 49729 | 57 | technician | married | secondary | no | 295 | no | no | cellular | 19 | aug | 151 | 11 | -1 | 0 | unknown | no |
| 49730 | 28 | blue-collar | married | secondary | no | 1137 | no | no | cellular | 6 | feb | 129 | 4 | 211 | 3 | other | no |
| 49731 | 44 | entrepreneur | single | tertiary | no | 1136 | yes | yes | cellular | 3 | apr | 345 | 2 | 249 | 7 | other | no |
49732 rows × 17 columns
df_bank.sample(10)
| age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | subscribed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 28280 | 60 | retired | married | unknown | no | 3875 | yes | no | cellular | 29 | jan | 55 | 1 | -1 | 0 | unknown | no |
| 39415 | 39 | management | single | tertiary | no | 307 | no | no | cellular | 22 | may | 182 | 2 | -1 | 0 | unknown | yes |
| 44569 | 30 | admin. | single | secondary | no | 125 | yes | no | cellular | 16 | aug | 140 | 3 | 116 | 1 | other | no |
| 37501 | 32 | blue-collar | married | secondary | no | 335 | yes | no | cellular | 13 | may | 240 | 3 | 370 | 1 | failure | no |
| 13448 | 30 | management | single | secondary | no | 1090 | yes | no | cellular | 9 | jul | 65 | 1 | -1 | 0 | unknown | no |
| 24835 | 42 | entrepreneur | married | tertiary | no | 257 | no | yes | cellular | 18 | nov | 340 | 2 | -1 | 0 | unknown | no |
| 1485 | 42 | technician | married | secondary | no | 0 | no | yes | unknown | 8 | may | 7 | 3 | -1 | 0 | unknown | no |
| 29320 | 47 | technician | married | secondary | no | 1167 | yes | no | cellular | 2 | feb | 139 | 3 | 73 | 2 | success | no |
| 43514 | 42 | housemaid | divorced | primary | no | 273 | no | no | cellular | 21 | apr | 354 | 1 | -1 | 0 | unknown | no |
| 1247 | 49 | services | married | secondary | no | -8 | yes | no | unknown | 8 | may | 1119 | 1 | -1 | 0 | unknown | yes |
df_bank["subscribed"].value_counts()
no 43922 yes 5810 Name: subscribed, dtype: int64
#read train and test dataset
#df1 = pd.read_csv("F:/Bootcamp Data Science/Final Project/Banking Dataset - Marketing Targets Maybe using this dataset/train.csv", sep=';')
#df1
#df2 = pd.read_csv("F:/Bootcamp Data Science/Final Project/Banking Dataset - Marketing Targets Maybe using this dataset/test.csv" , sep=';' , engine='python')
#df2.rename(columns={'y': 'subscribed'} , inplace = True)
#df2.tail()
#Copy data bank
X = df_bank.copy()
#concat 2 datsets
#X = pd.concat([df1,df2])
#X = X.drop_duplicates()
#X.head()
train , test = train_test_split(X,test_size=0.2, random_state=1)
train
| age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | subscribed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 823 | 33 | blue-collar | single | primary | no | 95 | yes | no | unknown | 7 | may | 102 | 1 | -1 | 0 | unknown | no |
| 32895 | 38 | blue-collar | married | secondary | no | 194 | yes | no | cellular | 17 | apr | 576 | 4 | 340 | 7 | failure | no |
| 31304 | 19 | student | single | secondary | no | 1803 | no | no | cellular | 10 | mar | 59 | 1 | -1 | 0 | unknown | no |
| 15594 | 25 | blue-collar | married | secondary | no | 895 | no | no | cellular | 21 | jul | 317 | 3 | -1 | 0 | unknown | no |
| 14670 | 45 | management | married | tertiary | no | 0 | no | yes | cellular | 15 | jul | 112 | 4 | -1 | 0 | unknown | no |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 43723 | 50 | technician | married | tertiary | no | 1830 | no | no | cellular | 14 | may | 318 | 3 | -1 | 0 | unknown | no |
| 32511 | 34 | management | divorced | tertiary | no | 528 | yes | no | cellular | 17 | apr | 233 | 2 | -1 | 0 | unknown | no |
| 5192 | 48 | management | married | tertiary | no | 4191 | no | no | unknown | 21 | may | 363 | 6 | -1 | 0 | unknown | no |
| 12172 | 45 | admin. | married | unknown | no | 1783 | no | no | unknown | 20 | jun | 11 | 11 | -1 | 0 | unknown | no |
| 33003 | 45 | blue-collar | married | unknown | no | 349 | yes | no | cellular | 17 | apr | 179 | 2 | 336 | 2 | other | no |
39785 rows × 17 columns
test
| age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | subscribed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8165 | 24 | blue-collar | single | primary | no | 147 | yes | no | unknown | 2 | jun | 102 | 2 | -1 | 0 | unknown | no |
| 21927 | 32 | technician | married | secondary | no | 20 | no | no | cellular | 20 | aug | 210 | 2 | -1 | 0 | unknown | no |
| 9756 | 34 | housemaid | married | tertiary | no | 5011 | yes | no | unknown | 9 | jun | 311 | 2 | -1 | 0 | unknown | no |
| 14416 | 47 | technician | married | secondary | no | 0 | no | no | telephone | 14 | jul | 81 | 2 | -1 | 0 | unknown | no |
| 35457 | 48 | management | divorced | secondary | no | 122 | yes | no | telephone | 7 | may | 325 | 1 | -1 | 0 | unknown | no |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2402 | 23 | admin. | single | secondary | no | 281 | yes | no | unknown | 13 | may | 124 | 1 | -1 | 0 | unknown | no |
| 6976 | 51 | technician | divorced | secondary | no | 0 | yes | no | unknown | 28 | may | 463 | 2 | -1 | 0 | unknown | no |
| 7235 | 51 | management | married | tertiary | no | 0 | yes | yes | unknown | 29 | may | 410 | 5 | -1 | 0 | unknown | no |
| 16819 | 39 | blue-collar | married | secondary | no | -66 | no | yes | telephone | 24 | jul | 626 | 5 | -1 | 0 | unknown | no |
| 12936 | 31 | management | single | tertiary | no | 1598 | yes | no | cellular | 7 | jul | 36 | 33 | -1 | 0 | unknown | no |
9947 rows × 17 columns
X
| age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | subscribed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 58 | management | married | tertiary | no | 2143 | yes | no | unknown | 5 | may | 261 | 1 | -1 | 0 | unknown | no |
| 1 | 44 | technician | single | secondary | no | 29 | yes | no | unknown | 5 | may | 151 | 1 | -1 | 0 | unknown | no |
| 2 | 33 | entrepreneur | married | secondary | no | 2 | yes | yes | unknown | 5 | may | 76 | 1 | -1 | 0 | unknown | no |
| 3 | 47 | blue-collar | married | unknown | no | 1506 | yes | no | unknown | 5 | may | 92 | 1 | -1 | 0 | unknown | no |
| 4 | 33 | unknown | single | unknown | no | 1 | no | no | unknown | 5 | may | 198 | 1 | -1 | 0 | unknown | no |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 49727 | 33 | services | married | secondary | no | -333 | yes | no | cellular | 30 | jul | 329 | 5 | -1 | 0 | unknown | no |
| 49728 | 57 | self-employed | married | tertiary | yes | -3313 | yes | yes | unknown | 9 | may | 153 | 1 | -1 | 0 | unknown | no |
| 49729 | 57 | technician | married | secondary | no | 295 | no | no | cellular | 19 | aug | 151 | 11 | -1 | 0 | unknown | no |
| 49730 | 28 | blue-collar | married | secondary | no | 1137 | no | no | cellular | 6 | feb | 129 | 4 | 211 | 3 | other | no |
| 49731 | 44 | entrepreneur | single | tertiary | no | 1136 | yes | yes | cellular | 3 | apr | 345 | 2 | 249 | 7 | other | no |
49732 rows × 17 columns
df_bank.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 49732 entries, 0 to 49731 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 49732 non-null int64 1 job 49732 non-null object 2 marital 49732 non-null object 3 education 49732 non-null object 4 default 49732 non-null object 5 balance 49732 non-null int64 6 housing 49732 non-null object 7 loan 49732 non-null object 8 contact 49732 non-null object 9 day 49732 non-null int64 10 month 49732 non-null object 11 duration 49732 non-null int64 12 campaign 49732 non-null int64 13 pdays 49732 non-null int64 14 previous 49732 non-null int64 15 poutcome 49732 non-null object 16 subscribed 49732 non-null object dtypes: int64(7), object(10) memory usage: 6.5+ MB
Based on the displayed result above
Categorical Data : the datatype of job, marital, education, default, housing, loan, contact, month, poutcome are object.
Numerical Data : Age, balance, day, duration, campaign, pdays, previous are int64, it means that they are numerical variables.
plt.figure(figsize=(20,10))
sns.heatmap(df_bank.corr(),annot = True)
<matplotlib.axes._subplots.AxesSubplot at 0x24f9a013a90>
From the graph above, it seems like nothing highly correlated as most of the values is below 0.5. 1 means is correlated while 0 means no correlation. There is no correlation, no relationship between the independent variables.So, it is good practice to check it before selecting the variables into the regression model since is one of the steps to avoid overfitting. Correlation matrix: (0<=|c|<0.3: weak), (0.3<=|c|<0.7: moderate) & (0.7<=|c|<1: strong)
def cat_data(title):
#train = pd.concat([Xtrain,ytrain],axis=1)
ax=sns.catplot(x=title,kind='count', hue="subscribed", palette='viridis', data=df_bank)
def categorical_data(title):
plt.figure(figsize=(17,8))
g=sns.catplot(x=title,kind='count', hue="subscribed", palette='viridis', data=df_bank)
ax = g.facet_axis(0,0)
for p in ax.patches:
ax.text(p.get_x() + 0.02,
p.get_height() * 1.02,
'{0:.2f}'.format(p.get_height()),
color='black', rotation='horizontal', size='large')
categorical_data("marital")
#marital_data = pd.crosstab(df_bank["marital"],df_bank["subscribed"])
#print(marital_data)
<Figure size 1224x576 with 0 Axes>
plt.figure(figsize=(13,7))
categorical_data("default")
<Figure size 936x504 with 0 Axes>
<Figure size 1224x576 with 0 Axes>
plt.figure(figsize=(13,7))
ax = sns.countplot(x='job', hue="subscribed", palette='viridis', data=df_bank)
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
#Plus a label number count in each bar
for p in ax.patches:
ax.annotate('{:.1f}'.format(p.get_height()), (p.get_x()+0.1, p.get_height()+50))
plt.tight_layout()
plt.show()
#pd.crosstab(Xtrain.loan.values,ytrain.values)
categorical_data("contact")
#loan_train = pd.crosstab(Xtrain.loan,ytrain)
#print(loan_train)
<Figure size 1224x576 with 0 Axes>
#Xtrain.loan.value_counts()
#ytrain.shape
categorical_data("loan")
#loan_train = pd.crosstab(Xtrain.loan,ytrain)
#print(loan_train)
<Figure size 1224x576 with 0 Axes>
cat_data("month")
#loan_train = pd.crosstab(Xtrain.loan,ytrain)
#print(loan_train)
plt.figure(figsize=(13,7))
ax = sns.countplot(x='month', hue="subscribed", palette='viridis', data=df_bank)
#ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
#Plus a label number count in each bar
for p in ax.patches:
ax.annotate('{:.1f}'.format(p.get_height()), (p.get_x()+0.1, p.get_height()+50))
plt.tight_layout()
plt.title("Bivariate analysis of the relationship between 'Month' and 'Subscribed'")
plt.ylabel("Count")
plt.xlabel("Month")
plt.show()
plt.figure(figsize=(13,7))
ax = sns.countplot(x='poutcome', hue="subscribed", palette='viridis', data=df_bank)
#ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
#Plus a label number count in each bar
for p in ax.patches:
ax.annotate('{:.1f}'.format(p.get_height()), (p.get_x()+0.1, p.get_height()+50))
plt.tight_layout()
plt.show()
# desciptive analysis for numerical columns
df_bank.describe()
| age | balance | day | duration | campaign | pdays | previous | |
|---|---|---|---|---|---|---|---|
| count | 49732.000000 | 49732.000000 | 49732.000000 | 49732.000000 | 49732.000000 | 49732.000000 | 49732.000000 |
| mean | 40.957472 | 1367.761562 | 15.816315 | 258.690179 | 2.766549 | 40.158630 | 0.576892 |
| std | 10.615008 | 3041.608766 | 8.315680 | 257.743149 | 3.099075 | 100.127123 | 2.254838 |
| min | 18.000000 | -8019.000000 | 1.000000 | 0.000000 | 1.000000 | -1.000000 | 0.000000 |
| 25% | 33.000000 | 72.000000 | 8.000000 | 103.000000 | 1.000000 | -1.000000 | 0.000000 |
| 50% | 39.000000 | 448.000000 | 16.000000 | 180.000000 | 2.000000 | -1.000000 | 0.000000 |
| 75% | 48.000000 | 1431.000000 | 21.000000 | 320.000000 | 3.000000 | -1.000000 | 0.000000 |
| max | 95.000000 | 102127.000000 | 31.000000 | 4918.000000 | 63.000000 | 871.000000 | 275.000000 |
def dist_bivariate(title):
#train = pd.concat([Xtrain,ytrain],axis=1)
sns.FacetGrid(df_bank,hue='subscribed',palette='cubehelix',size=5 ).map(sns.distplot,title).add_legend()
dist_bivariate('age')
C:\Users\ASUS\anaconda3\lib\site-packages\seaborn\axisgrid.py:316: UserWarning: The `size` parameter has been renamed to `height`; please update your code. warnings.warn(msg, UserWarning) C:\Users\ASUS\anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) C:\Users\ASUS\anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
dist_bivariate('balance')
C:\Users\ASUS\anaconda3\lib\site-packages\seaborn\axisgrid.py:316: UserWarning: The `size` parameter has been renamed to `height`; please update your code. warnings.warn(msg, UserWarning) C:\Users\ASUS\anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) C:\Users\ASUS\anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
dist_bivariate("day")
C:\Users\ASUS\anaconda3\lib\site-packages\seaborn\axisgrid.py:316: UserWarning: The `size` parameter has been renamed to `height`; please update your code. warnings.warn(msg, UserWarning) C:\Users\ASUS\anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) C:\Users\ASUS\anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
dist_bivariate("duration")
C:\Users\ASUS\anaconda3\lib\site-packages\seaborn\axisgrid.py:316: UserWarning: The `size` parameter has been renamed to `height`; please update your code. warnings.warn(msg, UserWarning) C:\Users\ASUS\anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) C:\Users\ASUS\anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
dist_bivariate('pdays')
C:\Users\ASUS\anaconda3\lib\site-packages\seaborn\axisgrid.py:316: UserWarning: The `size` parameter has been renamed to `height`; please update your code. warnings.warn(msg, UserWarning) C:\Users\ASUS\anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) C:\Users\ASUS\anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
dist_bivariate("previous")
C:\Users\ASUS\anaconda3\lib\site-packages\seaborn\axisgrid.py:316: UserWarning: The `size` parameter has been renamed to `height`; please update your code. warnings.warn(msg, UserWarning) C:\Users\ASUS\anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) C:\Users\ASUS\anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
In this part, data preparation do before modelling, so we treatment the train data
# Detect duplicate data
train[train.duplicated(keep=False)]
| age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | subscribed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 37266 | 36 | technician | married | secondary | no | 55 | yes | no | cellular | 13 | may | 38 | 2 | 369 | 4 | failure | no |
| 18403 | 58 | retired | divorced | primary | no | 2538 | yes | no | cellular | 31 | jul | 307 | 6 | -1 | 0 | unknown | no |
| 40391 | 45 | management | married | tertiary | no | 1529 | no | no | cellular | 30 | jun | 160 | 1 | -1 | 0 | unknown | no |
| 47830 | 49 | services | married | secondary | no | 823 | no | no | cellular | 8 | aug | 780 | 4 | -1 | 0 | unknown | yes |
| 48540 | 34 | management | divorced | tertiary | no | 13204 | yes | yes | cellular | 20 | nov | 197 | 2 | -1 | 0 | unknown | no |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 45571 | 36 | management | single | tertiary | no | 339 | no | no | cellular | 21 | nov | 6 | 6 | -1 | 0 | unknown | no |
| 46467 | 55 | services | divorced | secondary | no | 218 | yes | no | unknown | 29 | may | 200 | 2 | -1 | 0 | unknown | no |
| 47560 | 25 | technician | single | secondary | no | -80 | yes | yes | unknown | 3 | jun | 89 | 1 | -1 | 0 | unknown | no |
| 45413 | 35 | technician | single | tertiary | no | 2658 | yes | no | cellular | 19 | aug | 768 | 2 | 141 | 3 | failure | yes |
| 49100 | 47 | management | single | tertiary | no | 4402 | no | no | telephone | 10 | dec | 805 | 1 | 190 | 1 | other | yes |
5780 rows × 17 columns
print("Before drop duplicated data")
train.shape
Before drop duplicated data
(39785, 17)
train = train.drop_duplicates(keep='first')
print("After drop duplicated data")
train.shape
After drop duplicated data
(36895, 17)
#Check again for duplicated data
train[train.duplicated(keep=False)]
| age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | subscribed |
|---|
There are not duplicated rows again after drop duplicated.
Using IQR because from numerical data, the distribution data is not normal distribution
# Outlier Analysis
plt.figure(figsize=(16, 8))
# Create Boxplot
train.boxplot(vert=False)
plt.subplots_adjust(left=0.25)
plt.show()
train.shape
(36895, 17)
Q1 = train["previous"].quantile(0.25)
Q3 = train["previous"].quantile(0.75)
IQR = Q3 - Q1
low_limit = Q1 - (IQR * 1.5)
high_limit = Q3 + (IQR * 1.5)
train = train[(train["previous"] >= low_limit) &
(train["previous"] <= high_limit)
]
train.shape
(30090, 17)
train.shape
(30090, 17)
Q1 = train["pdays"].quantile(0.25)
Q3 = train["pdays"].quantile(0.75)
IQR = Q3 - Q1
low_limit = Q1 - (IQR * 1.5)
high_limit = Q3 + (IQR * 1.5)
train = train[(train["pdays"] >= low_limit) &
(train["pdays"] <= high_limit)
]
train.shape
(30090, 17)
train.shape
(30090, 17)
Q1 = train["campaign"].quantile(0.25)
Q3 = train["campaign"].quantile(0.75)
IQR = Q3 - Q1
low_limit = Q1 - (IQR * 1.5)
high_limit = Q3 + (IQR * 1.5)
train = train[(train["campaign"] >= low_limit) &
(train["campaign"] <= high_limit)
]
train.shape
(27788, 17)
train.shape
(27788, 17)
Q1 = train["duration"].quantile(0.25)
Q3 = train["duration"].quantile(0.75)
IQR = Q3 - Q1
low_limit = Q1 - (IQR * 1.5)
high_limit = Q3 + (IQR * 1.5)
train = train[(train["duration"] >= low_limit) &
(train["duration"] <= high_limit)
]
train.shape
(25709, 17)
train.shape
(25709, 17)
Q1 = train["day"].quantile(0.25)
Q3 = train["day"].quantile(0.75)
IQR = Q3 - Q1
low_limit = Q1 - (IQR * 1.5)
high_limit = Q3 + (IQR * 1.5)
train = train[(train["day"] >= low_limit) &
(train["day"] <= high_limit)
]
train.shape
(25709, 17)
train.shape
(25709, 17)
Q1 = train["balance"].quantile(0.25)
Q3 = train["balance"].quantile(0.75)
IQR = Q3 - Q1
low_limit = Q1 - (IQR * 1.5)
high_limit = Q3 + (IQR * 1.5)
train = train[(train["balance"] >= low_limit) &
(train["balance"] <= high_limit)
]
train.shape
(22955, 17)
train.shape
(22955, 17)
Q1 = train["age"].quantile(0.25)
Q3 = train["age"].quantile(0.75)
IQR = Q3 - Q1
low_limit = Q1 - (IQR * 1.5)
high_limit = Q3 + (IQR * 1.5)
train = train[(train["age"] >= low_limit) &
(train["age"] <= high_limit)
]
train.shape
(22822, 17)
# Outlier Analysis checking again
plt.figure(figsize=(16, 8))
# Create Boxplot
train.boxplot(vert=False)
plt.subplots_adjust(left=0.25)
plt.show()
train
| age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | subscribed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 823 | 33 | blue-collar | single | primary | no | 95 | yes | no | unknown | 7 | may | 102 | 1 | -1 | 0 | unknown | no |
| 31304 | 19 | student | single | secondary | no | 1803 | no | no | cellular | 10 | mar | 59 | 1 | -1 | 0 | unknown | no |
| 15594 | 25 | blue-collar | married | secondary | no | 895 | no | no | cellular | 21 | jul | 317 | 3 | -1 | 0 | unknown | no |
| 14670 | 45 | management | married | tertiary | no | 0 | no | yes | cellular | 15 | jul | 112 | 4 | -1 | 0 | unknown | no |
| 26345 | 32 | blue-collar | divorced | secondary | no | 658 | no | no | cellular | 20 | nov | 173 | 2 | -1 | 0 | unknown | no |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7751 | 36 | blue-collar | married | secondary | no | 740 | yes | no | unknown | 30 | may | 171 | 2 | -1 | 0 | unknown | no |
| 20609 | 30 | management | single | tertiary | no | 67 | no | no | cellular | 12 | aug | 578 | 4 | -1 | 0 | unknown | no |
| 7813 | 36 | management | single | tertiary | no | 2806 | yes | no | unknown | 30 | may | 358 | 4 | -1 | 0 | unknown | no |
| 43723 | 50 | technician | married | tertiary | no | 1830 | no | no | cellular | 14 | may | 318 | 3 | -1 | 0 | unknown | no |
| 32511 | 34 | management | divorced | tertiary | no | 528 | yes | no | cellular | 17 | apr | 233 | 2 | -1 | 0 | unknown | no |
22822 rows × 17 columns
train.sample(7)
| age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | subscribed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 30656 | 42 | entrepreneur | married | primary | yes | 66 | yes | yes | cellular | 6 | feb | 139 | 1 | -1 | 0 | unknown | no |
| 22944 | 35 | management | married | tertiary | no | 1285 | no | no | cellular | 26 | aug | 100 | 4 | -1 | 0 | unknown | no |
| 22339 | 46 | technician | divorced | tertiary | no | 532 | no | no | cellular | 22 | aug | 491 | 2 | -1 | 0 | unknown | no |
| 11441 | 52 | entrepreneur | divorced | secondary | no | 302 | no | no | unknown | 19 | jun | 169 | 2 | -1 | 0 | unknown | no |
| 32 | 60 | admin. | married | secondary | no | 39 | yes | yes | unknown | 5 | may | 208 | 1 | -1 | 0 | unknown | no |
| 10498 | 38 | services | married | tertiary | no | 1018 | no | no | unknown | 16 | jun | 325 | 3 | -1 | 0 | unknown | no |
| 46856 | 41 | entrepreneur | married | secondary | no | 353 | no | no | cellular | 19 | nov | 90 | 2 | -1 | 0 | unknown | no |
train['n_missing'] = train.isnull().sum(axis=1)
train.sort_values('n_missing',ascending=False)
| age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | subscribed | n_missing | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 823 | 33 | blue-collar | single | primary | no | 95 | yes | no | unknown | 7 | may | 102 | 1 | -1 | 0 | unknown | no | 0 |
| 48178 | 46 | blue-collar | divorced | primary | no | 1712 | no | no | unknown | 19 | jun | 110 | 1 | -1 | 0 | unknown | no | 0 |
| 19040 | 52 | admin. | married | primary | no | 813 | no | yes | cellular | 5 | aug | 113 | 2 | -1 | 0 | unknown | no | 0 |
| 9856 | 59 | blue-collar | divorced | primary | no | 1320 | no | no | unknown | 9 | jun | 522 | 1 | -1 | 0 | unknown | no | 0 |
| 26996 | 30 | services | single | secondary | no | 62 | yes | no | cellular | 21 | nov | 76 | 1 | -1 | 0 | unknown | no | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8976 | 50 | unemployed | married | primary | no | 705 | no | no | unknown | 5 | jun | 504 | 1 | -1 | 0 | unknown | yes | 0 |
| 42675 | 50 | admin. | married | secondary | no | 2955 | no | no | cellular | 14 | jan | 166 | 1 | -1 | 0 | unknown | no | 0 |
| 6185 | 33 | blue-collar | married | primary | no | 1217 | yes | no | unknown | 27 | may | 271 | 3 | -1 | 0 | unknown | no | 0 |
| 17605 | 42 | self-employed | married | secondary | no | 2130 | yes | yes | cellular | 29 | jul | 146 | 2 | -1 | 0 | unknown | no | 0 |
| 32511 | 34 | management | divorced | tertiary | no | 528 | yes | no | cellular | 17 | apr | 233 | 2 | -1 | 0 | unknown | no | 0 |
22822 rows × 18 columns
train.drop('n_missing', axis=1, inplace=True)
train
| age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | subscribed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 823 | 33 | blue-collar | single | primary | no | 95 | yes | no | unknown | 7 | may | 102 | 1 | -1 | 0 | unknown | no |
| 31304 | 19 | student | single | secondary | no | 1803 | no | no | cellular | 10 | mar | 59 | 1 | -1 | 0 | unknown | no |
| 15594 | 25 | blue-collar | married | secondary | no | 895 | no | no | cellular | 21 | jul | 317 | 3 | -1 | 0 | unknown | no |
| 14670 | 45 | management | married | tertiary | no | 0 | no | yes | cellular | 15 | jul | 112 | 4 | -1 | 0 | unknown | no |
| 26345 | 32 | blue-collar | divorced | secondary | no | 658 | no | no | cellular | 20 | nov | 173 | 2 | -1 | 0 | unknown | no |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7751 | 36 | blue-collar | married | secondary | no | 740 | yes | no | unknown | 30 | may | 171 | 2 | -1 | 0 | unknown | no |
| 20609 | 30 | management | single | tertiary | no | 67 | no | no | cellular | 12 | aug | 578 | 4 | -1 | 0 | unknown | no |
| 7813 | 36 | management | single | tertiary | no | 2806 | yes | no | unknown | 30 | may | 358 | 4 | -1 | 0 | unknown | no |
| 43723 | 50 | technician | married | tertiary | no | 1830 | no | no | cellular | 14 | may | 318 | 3 | -1 | 0 | unknown | no |
| 32511 | 34 | management | divorced | tertiary | no | 528 | yes | no | cellular | 17 | apr | 233 | 2 | -1 | 0 | unknown | no |
22822 rows × 17 columns
train.isnull().sum()
age 0 job 0 marital 0 education 0 default 0 balance 0 housing 0 loan 0 contact 0 day 0 month 0 duration 0 campaign 0 pdays 0 previous 0 poutcome 0 subscribed 0 dtype: int64
train.isna().sum()
age 0 job 0 marital 0 education 0 default 0 balance 0 housing 0 loan 0 contact 0 day 0 month 0 duration 0 campaign 0 pdays 0 previous 0 poutcome 0 subscribed 0 dtype: int64
train.isnull().sum().sum()
0
train.isnull().any().any()
False
count_nan = train.isnull().sum()
print ('Count of NaN: ' + str(count_nan))
Count of NaN: age 0 job 0 marital 0 education 0 default 0 balance 0 housing 0 loan 0 contact 0 day 0 month 0 duration 0 campaign 0 pdays 0 previous 0 poutcome 0 subscribed 0 dtype: int64
for column in train.columns:
print(f"============= {column} =================")
display(train[column].value_counts())
print()
============= age =================
32 1057 31 1029 33 995 35 968 34 953 36 925 30 904 37 833 39 732 38 710 40 681 41 669 42 648 29 627 45 617 46 612 44 598 43 577 47 559 28 558 49 545 48 509 51 501 27 495 52 494 50 491 26 437 53 437 57 408 56 404 54 400 55 395 59 380 58 359 25 303 60 270 24 162 23 103 22 71 61 61 21 51 63 34 64 30 20 26 66 25 62 23 65 22 70 22 67 20 71 19 69 19 72 18 19 17 68 10 18 9 Name: age, dtype: int64
============= job =================
blue-collar 5114 management 4532 technician 3871 admin. 2624 services 2262 retired 964 entrepreneur 793 self-employed 775 unemployed 665 housemaid 649 student 433 unknown 140 Name: job, dtype: int64
============= marital =================
married 13653 single 6458 divorced 2711 Name: marital, dtype: int64
============= education =================
secondary 12099 tertiary 6283 primary 3567 unknown 873 Name: education, dtype: int64
============= default =================
no 22308 yes 514 Name: default, dtype: int64
============= balance =================
0 2087
1 123
4 83
2 82
3 78
...
1556 1
-651 1
-692 1
-1164 1
2971 1
Name: balance, Length: 3707, dtype: int64
============= housing =================
yes 12639 no 10183 Name: housing, dtype: int64
============= loan =================
no 18815 yes 4007 Name: loan, dtype: int64
============= contact =================
cellular 13436 unknown 8186 telephone 1200 Name: contact, dtype: int64
============= day =================
20 1287 18 1086 6 1054 8 1033 5 1018 7 999 21 982 14 958 28 945 17 937 9 916 29 911 19 873 30 780 12 769 15 752 16 714 13 708 11 700 4 690 2 623 27 601 26 561 23 541 3 517 22 502 25 430 10 276 31 275 24 251 1 133 Name: day, dtype: int64
============= month =================
may 7434 jul 4124 aug 3290 jun 2933 nov 1575 apr 1129 feb 1108 jan 619 oct 211 mar 175 sep 161 dec 63 Name: month, dtype: int64
============= duration =================
123 106
104 106
90 105
121 104
112 103
...
598 2
0 2
586 2
596 2
639 2
Name: duration, Length: 646, dtype: int64
============= campaign =================
1 9085 2 6849 3 2989 4 2090 5 1038 6 771 Name: campaign, dtype: int64
============= pdays =================
-1 22822 Name: pdays, dtype: int64
============= previous =================
0 22822 Name: previous, dtype: int64
============= poutcome =================
unknown 22822 Name: poutcome, dtype: int64
============= subscribed =================
no 21514 yes 1308 Name: subscribed, dtype: int64
From the result code above, we saw that there are also missing value in several columns. Below is the list of all missing value for each column :
balance: average yearly balance, if the value is negative because banks charge fees when this happens. And your bank could close your account if it stays negative for too long. pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric, -1 means client was not previously contacted)
Now we know what are the missing values contained in each of the columns. Next, we have to decide how should we act on those columns that have missing values. Should we drop that column? Or should we impute the missing value?
First, we have to see what's the percentage of missing value in that column. If it contained a lot of missing value (let say >35%), then we can just drop that column. If not, then we can do some imputation.
100 * len(train[train['job'].isin(['unknown'])]) / len(train)
0.6134431688721409
100 * len(train[train['education'].isin(['unknown'])]) / len(train)
3.8252563316098502
100 * len(train[train['contact'].isin(['unknown'])]) / len(train)
35.86889843133818
100 * len(train[train['poutcome'].isin(['unknown'])]) / len(train)
100.0
so there are 2 columns that have missing value >35%, they are contact and poutcome. But, for me contact column is important to analyze because of in this dataset, the marketing target way is only using celullar and telephone so we mustn't to drop it since the % missing value near 35%. So, we just drop poutcome column. I hope it's wise choice to imputing another column with unknown value.
train = train.drop(columns=['poutcome'])
train.sample(5)
| age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | subscribed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 16797 | 45 | admin. | single | secondary | no | 757 | yes | yes | cellular | 24 | jul | 144 | 3 | -1 | 0 | no |
| 37937 | 47 | admin. | married | secondary | no | 1863 | yes | yes | cellular | 14 | may | 151 | 4 | -1 | 0 | no |
| 47291 | 34 | services | divorced | secondary | no | 44 | no | no | unknown | 9 | jun | 118 | 1 | -1 | 0 | no |
| 912 | 47 | management | divorced | tertiary | no | 34 | yes | no | unknown | 7 | may | 104 | 1 | -1 | 0 | no |
| 28398 | 37 | unemployed | married | secondary | no | 1152 | no | no | cellular | 29 | jan | 222 | 1 | -1 | 0 | no |
# Mode imputation on job column
mode_job = train[~train['job'].isin(['unknown'])]['job'].mode()[0]
train['job'] = train['job'].apply(lambda x: mode_job if x in ['unknown'] else x)
mode_job
'blue-collar'
train['job'].value_counts()
blue-collar 5254 management 4532 technician 3871 admin. 2624 services 2262 retired 964 entrepreneur 793 self-employed 775 unemployed 665 housemaid 649 student 433 Name: job, dtype: int64
# Mode imputation on education column
mode_education = train[~train['education'].isin(['unknown'])]['education'].mode()[0]
train['education'] = train['education'].apply(lambda x: mode_education if x in ['unknown'] else x)
mode_education
'secondary'
train['education'].value_counts()
secondary 12972 tertiary 6283 primary 3567 Name: education, dtype: int64
# Mode imputation on contact column
mode_contact = train[~train['contact'].isin(['unknown'])]['contact'].mode()[0]
train['contact'] = train['contact'].apply(lambda x: mode_contact if x in ['unknown'] else x)
mode_contact
'cellular'
train['contact'].value_counts()
cellular 21622 telephone 1200 Name: contact, dtype: int64
for column in train.columns:
print(f"============= {column} =================")
display(train[column].value_counts())
print()
============= age =================
32 1057 31 1029 33 995 35 968 34 953 36 925 30 904 37 833 39 732 38 710 40 681 41 669 42 648 29 627 45 617 46 612 44 598 43 577 47 559 28 558 49 545 48 509 51 501 27 495 52 494 50 491 26 437 53 437 57 408 56 404 54 400 55 395 59 380 58 359 25 303 60 270 24 162 23 103 22 71 61 61 21 51 63 34 64 30 20 26 66 25 62 23 65 22 70 22 67 20 71 19 69 19 72 18 19 17 68 10 18 9 Name: age, dtype: int64
============= job =================
blue-collar 5254 management 4532 technician 3871 admin. 2624 services 2262 retired 964 entrepreneur 793 self-employed 775 unemployed 665 housemaid 649 student 433 Name: job, dtype: int64
============= marital =================
married 13653 single 6458 divorced 2711 Name: marital, dtype: int64
============= education =================
secondary 12972 tertiary 6283 primary 3567 Name: education, dtype: int64
============= default =================
no 22308 yes 514 Name: default, dtype: int64
============= balance =================
0 2087
1 123
4 83
2 82
3 78
...
1556 1
-651 1
-692 1
-1164 1
2971 1
Name: balance, Length: 3707, dtype: int64
============= housing =================
yes 12639 no 10183 Name: housing, dtype: int64
============= loan =================
no 18815 yes 4007 Name: loan, dtype: int64
============= contact =================
cellular 21622 telephone 1200 Name: contact, dtype: int64
============= day =================
20 1287 18 1086 6 1054 8 1033 5 1018 7 999 21 982 14 958 28 945 17 937 9 916 29 911 19 873 30 780 12 769 15 752 16 714 13 708 11 700 4 690 2 623 27 601 26 561 23 541 3 517 22 502 25 430 10 276 31 275 24 251 1 133 Name: day, dtype: int64
============= month =================
may 7434 jul 4124 aug 3290 jun 2933 nov 1575 apr 1129 feb 1108 jan 619 oct 211 mar 175 sep 161 dec 63 Name: month, dtype: int64
============= duration =================
123 106
104 106
90 105
121 104
112 103
...
598 2
0 2
586 2
596 2
639 2
Name: duration, Length: 646, dtype: int64
============= campaign =================
1 9085 2 6849 3 2989 4 2090 5 1038 6 771 Name: campaign, dtype: int64
============= pdays =================
-1 22822 Name: pdays, dtype: int64
============= previous =================
0 22822 Name: previous, dtype: int64
============= subscribed =================
no 21514 yes 1308 Name: subscribed, dtype: int64
train
| age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | subscribed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 823 | 33 | blue-collar | single | primary | no | 95 | yes | no | cellular | 7 | may | 102 | 1 | -1 | 0 | no |
| 31304 | 19 | student | single | secondary | no | 1803 | no | no | cellular | 10 | mar | 59 | 1 | -1 | 0 | no |
| 15594 | 25 | blue-collar | married | secondary | no | 895 | no | no | cellular | 21 | jul | 317 | 3 | -1 | 0 | no |
| 14670 | 45 | management | married | tertiary | no | 0 | no | yes | cellular | 15 | jul | 112 | 4 | -1 | 0 | no |
| 26345 | 32 | blue-collar | divorced | secondary | no | 658 | no | no | cellular | 20 | nov | 173 | 2 | -1 | 0 | no |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7751 | 36 | blue-collar | married | secondary | no | 740 | yes | no | cellular | 30 | may | 171 | 2 | -1 | 0 | no |
| 20609 | 30 | management | single | tertiary | no | 67 | no | no | cellular | 12 | aug | 578 | 4 | -1 | 0 | no |
| 7813 | 36 | management | single | tertiary | no | 2806 | yes | no | cellular | 30 | may | 358 | 4 | -1 | 0 | no |
| 43723 | 50 | technician | married | tertiary | no | 1830 | no | no | cellular | 14 | may | 318 | 3 | -1 | 0 | no |
| 32511 | 34 | management | divorced | tertiary | no | 528 | yes | no | cellular | 17 | apr | 233 | 2 | -1 | 0 | no |
22822 rows × 16 columns
After we doing the missing value handling and imputation, we do the imputation and drop poutcome in data test, we inject the information from imputation data train to data test. we do it for the predict in data test after modelling. Do the same for data preparation modelling in data test but when imputate it reference to treatment in data train.
In this part, data preparation do before modelling, so we treatment the train data
test
| age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | subscribed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8165 | 24 | blue-collar | single | primary | no | 147 | yes | no | unknown | 2 | jun | 102 | 2 | -1 | 0 | unknown | no |
| 21927 | 32 | technician | married | secondary | no | 20 | no | no | cellular | 20 | aug | 210 | 2 | -1 | 0 | unknown | no |
| 9756 | 34 | housemaid | married | tertiary | no | 5011 | yes | no | unknown | 9 | jun | 311 | 2 | -1 | 0 | unknown | no |
| 14416 | 47 | technician | married | secondary | no | 0 | no | no | telephone | 14 | jul | 81 | 2 | -1 | 0 | unknown | no |
| 35457 | 48 | management | divorced | secondary | no | 122 | yes | no | telephone | 7 | may | 325 | 1 | -1 | 0 | unknown | no |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2402 | 23 | admin. | single | secondary | no | 281 | yes | no | unknown | 13 | may | 124 | 1 | -1 | 0 | unknown | no |
| 6976 | 51 | technician | divorced | secondary | no | 0 | yes | no | unknown | 28 | may | 463 | 2 | -1 | 0 | unknown | no |
| 7235 | 51 | management | married | tertiary | no | 0 | yes | yes | unknown | 29 | may | 410 | 5 | -1 | 0 | unknown | no |
| 16819 | 39 | blue-collar | married | secondary | no | -66 | no | yes | telephone | 24 | jul | 626 | 5 | -1 | 0 | unknown | no |
| 12936 | 31 | management | single | tertiary | no | 1598 | yes | no | cellular | 7 | jul | 36 | 33 | -1 | 0 | unknown | no |
9947 rows × 17 columns
# Detect duplicate data
test[test.duplicated(keep=False)]
| age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | subscribed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 48894 | 42 | blue-collar | married | secondary | no | 179 | yes | no | cellular | 17 | nov | 96 | 1 | -1 | 0 | unknown | no |
| 45252 | 36 | management | single | tertiary | no | 553 | no | no | cellular | 11 | aug | 106 | 2 | -1 | 0 | unknown | no |
| 48236 | 38 | management | single | tertiary | no | 16957 | yes | no | telephone | 29 | jan | 208 | 1 | 247 | 2 | failure | no |
| 49728 | 57 | self-employed | married | tertiary | yes | -3313 | yes | yes | unknown | 9 | may | 153 | 1 | -1 | 0 | unknown | no |
| 23739 | 49 | management | single | tertiary | no | 588 | no | no | cellular | 28 | aug | 70 | 9 | -1 | 0 | unknown | no |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 35119 | 39 | technician | married | secondary | no | 147 | yes | no | cellular | 6 | may | 151 | 2 | -1 | 0 | unknown | no |
| 34661 | 59 | blue-collar | married | secondary | no | 5314 | yes | no | cellular | 5 | may | 200 | 3 | -1 | 0 | unknown | no |
| 36797 | 44 | blue-collar | married | primary | no | 887 | yes | no | cellular | 12 | may | 165 | 2 | 363 | 1 | failure | no |
| 46645 | 34 | admin. | single | secondary | no | 846 | yes | no | cellular | 17 | apr | 758 | 2 | -1 | 0 | unknown | no |
| 25812 | 45 | blue-collar | married | secondary | no | 2926 | yes | yes | cellular | 19 | nov | 482 | 3 | 120 | 2 | failure | no |
360 rows × 17 columns
print("Before drop duplicated data test")
test.shape
Before drop duplicated data test
(9947, 17)
test = test.drop_duplicates(keep='first')
print("After drop duplicated data test")
test.shape
After drop duplicated data test
(9767, 17)
#Check again for duplicated data
test[test.duplicated(keep=False)]
| age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | subscribed |
|---|
There are not duplicated rows again in data test after drop duplicated data. Drop the poutcome feature because we drop it in data train.
test = test.drop(columns=['poutcome'])
test
| age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | subscribed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8165 | 24 | blue-collar | single | primary | no | 147 | yes | no | unknown | 2 | jun | 102 | 2 | -1 | 0 | no |
| 21927 | 32 | technician | married | secondary | no | 20 | no | no | cellular | 20 | aug | 210 | 2 | -1 | 0 | no |
| 9756 | 34 | housemaid | married | tertiary | no | 5011 | yes | no | unknown | 9 | jun | 311 | 2 | -1 | 0 | no |
| 14416 | 47 | technician | married | secondary | no | 0 | no | no | telephone | 14 | jul | 81 | 2 | -1 | 0 | no |
| 35457 | 48 | management | divorced | secondary | no | 122 | yes | no | telephone | 7 | may | 325 | 1 | -1 | 0 | no |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2402 | 23 | admin. | single | secondary | no | 281 | yes | no | unknown | 13 | may | 124 | 1 | -1 | 0 | no |
| 6976 | 51 | technician | divorced | secondary | no | 0 | yes | no | unknown | 28 | may | 463 | 2 | -1 | 0 | no |
| 7235 | 51 | management | married | tertiary | no | 0 | yes | yes | unknown | 29 | may | 410 | 5 | -1 | 0 | no |
| 16819 | 39 | blue-collar | married | secondary | no | -66 | no | yes | telephone | 24 | jul | 626 | 5 | -1 | 0 | no |
| 12936 | 31 | management | single | tertiary | no | 1598 | yes | no | cellular | 7 | jul | 36 | 33 | -1 | 0 | no |
9767 rows × 16 columns
Using IQR
# Outlier Analysis
plt.figure(figsize=(16, 8))
# Create Boxplot
test.boxplot(vert=False)
plt.subplots_adjust(left=0.25)
plt.show()
print("Before remove outliers",test.shape)
Before remove outliers (9767, 16)
Q1 = test["previous"].quantile(0.25)
Q3 = test["previous"].quantile(0.75)
IQR = Q3 - Q1
low_limit = Q1 - (IQR * 1.5)
high_limit = Q3 + (IQR * 1.5)
test = test[(test["previous"] >= low_limit) &
(test["previous"] <= high_limit)
]
print("After remove outliers",test.shape)
After remove outliers (8066, 16)
print("Before remove outliers",test.shape)
Before remove outliers (8066, 16)
Q1 = test["pdays"].quantile(0.25)
Q3 = test["pdays"].quantile(0.75)
IQR = Q3 - Q1
low_limit = Q1 - (IQR * 1.5)
high_limit = Q3 + (IQR * 1.5)
test = test[(test["pdays"] >= low_limit) &
(test["pdays"] <= high_limit)
]
print("After remove outliers",test.shape)
After remove outliers (8066, 16)
print("Before remove outliers",test.shape)
Before remove outliers (8066, 16)
Q1 = test["campaign"].quantile(0.25)
Q3 = test["campaign"].quantile(0.75)
IQR = Q3 - Q1
low_limit = Q1 - (IQR * 1.5)
high_limit = Q3 + (IQR * 1.5)
test = test[(test["campaign"] >= low_limit) &
(test["campaign"] <= high_limit)
]
print("After remove outliers",test.shape)
After remove outliers (7429, 16)
print("Before remove outliers",test.shape)
Before remove outliers (7429, 16)
Q1 = test["duration"].quantile(0.25)
Q3 = test["duration"].quantile(0.75)
IQR = Q3 - Q1
low_limit = Q1 - (IQR * 1.5)
high_limit = Q3 + (IQR * 1.5)
test = test[(test["duration"] >= low_limit) &
(test["duration"] <= high_limit)
]
print("After remove outliers",test.shape)
After remove outliers (6924, 16)
print("Before remove outliers",test.shape)
Before remove outliers (6924, 16)
Q1 = test["day"].quantile(0.25)
Q3 = test["day"].quantile(0.75)
IQR = Q3 - Q1
low_limit = Q1 - (IQR * 1.5)
high_limit = Q3 + (IQR * 1.5)
test = test[(test["day"] >= low_limit) &
(test["day"] <= high_limit)
]
print("After remove outliers",test.shape)
After remove outliers (6924, 16)
print("Before remove outliers",test.shape)
Before remove outliers (6924, 16)
Q1 = test["balance"].quantile(0.25)
Q3 = test["balance"].quantile(0.75)
IQR = Q3 - Q1
low_limit = Q1 - (IQR * 1.5)
high_limit = Q3 + (IQR * 1.5)
test = test[(test["balance"] >= low_limit) &
(test["balance"] <= high_limit)
]
print("After remove outliers",test.shape)
After remove outliers (6186, 16)
print("Before remove outliers",test.shape)
Before remove outliers (6186, 16)
Q1 = test["age"].quantile(0.25)
Q3 = test["age"].quantile(0.75)
IQR = Q3 - Q1
low_limit = Q1 - (IQR * 1.5)
high_limit = Q3 + (IQR * 1.5)
test = test[(test["age"] >= low_limit) &
(test["age"] <= high_limit)
]
print("After remove outliers",test.shape)
After remove outliers (6141, 16)
# Outlier Analysis checking again
plt.figure(figsize=(16, 8))
# Create Boxplot
test.boxplot(vert=False)
plt.subplots_adjust(left=0.25)
plt.show()
test
| age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | subscribed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8165 | 24 | blue-collar | single | primary | no | 147 | yes | no | unknown | 2 | jun | 102 | 2 | -1 | 0 | no |
| 21927 | 32 | technician | married | secondary | no | 20 | no | no | cellular | 20 | aug | 210 | 2 | -1 | 0 | no |
| 14416 | 47 | technician | married | secondary | no | 0 | no | no | telephone | 14 | jul | 81 | 2 | -1 | 0 | no |
| 35457 | 48 | management | divorced | secondary | no | 122 | yes | no | telephone | 7 | may | 325 | 1 | -1 | 0 | no |
| 30863 | 52 | management | married | tertiary | no | 749 | no | yes | cellular | 9 | feb | 342 | 2 | -1 | 0 | no |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26293 | 54 | self-employed | married | primary | no | 2132 | no | yes | cellular | 20 | nov | 91 | 1 | -1 | 0 | no |
| 2402 | 23 | admin. | single | secondary | no | 281 | yes | no | unknown | 13 | may | 124 | 1 | -1 | 0 | no |
| 6976 | 51 | technician | divorced | secondary | no | 0 | yes | no | unknown | 28 | may | 463 | 2 | -1 | 0 | no |
| 7235 | 51 | management | married | tertiary | no | 0 | yes | yes | unknown | 29 | may | 410 | 5 | -1 | 0 | no |
| 16819 | 39 | blue-collar | married | secondary | no | -66 | no | yes | telephone | 24 | jul | 626 | 5 | -1 | 0 | no |
6141 rows × 16 columns
test['n_missing'] = test.isnull().sum(axis=1)
test.sort_values('n_missing',ascending=False)
| age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | subscribed | n_missing | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8165 | 24 | blue-collar | single | primary | no | 147 | yes | no | unknown | 2 | jun | 102 | 2 | -1 | 0 | no | 0 |
| 39908 | 31 | student | single | tertiary | no | 64 | no | no | cellular | 2 | jun | 261 | 2 | -1 | 0 | yes | 0 |
| 21579 | 32 | management | married | tertiary | no | 644 | no | no | cellular | 19 | aug | 100 | 2 | -1 | 0 | no | 0 |
| 28748 | 31 | services | married | secondary | no | 288 | no | no | cellular | 30 | jan | 164 | 1 | -1 | 0 | no | 0 |
| 3552 | 25 | blue-collar | single | secondary | no | 0 | yes | no | unknown | 15 | may | 244 | 2 | -1 | 0 | no | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 46336 | 38 | blue-collar | divorced | secondary | no | 2998 | yes | no | unknown | 11 | jun | 91 | 1 | -1 | 0 | no | 0 |
| 39743 | 25 | technician | married | secondary | no | 517 | no | no | cellular | 28 | may | 135 | 3 | -1 | 0 | no | 0 |
| 30298 | 37 | technician | single | secondary | no | 123 | no | yes | cellular | 5 | feb | 136 | 1 | -1 | 0 | no | 0 |
| 25507 | 37 | services | married | secondary | no | 345 | yes | yes | cellular | 19 | nov | 204 | 1 | -1 | 0 | no | 0 |
| 16819 | 39 | blue-collar | married | secondary | no | -66 | no | yes | telephone | 24 | jul | 626 | 5 | -1 | 0 | no | 0 |
6141 rows × 17 columns
test.drop('n_missing', axis=1, inplace=True)
test
| age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | subscribed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8165 | 24 | blue-collar | single | primary | no | 147 | yes | no | unknown | 2 | jun | 102 | 2 | -1 | 0 | no |
| 21927 | 32 | technician | married | secondary | no | 20 | no | no | cellular | 20 | aug | 210 | 2 | -1 | 0 | no |
| 14416 | 47 | technician | married | secondary | no | 0 | no | no | telephone | 14 | jul | 81 | 2 | -1 | 0 | no |
| 35457 | 48 | management | divorced | secondary | no | 122 | yes | no | telephone | 7 | may | 325 | 1 | -1 | 0 | no |
| 30863 | 52 | management | married | tertiary | no | 749 | no | yes | cellular | 9 | feb | 342 | 2 | -1 | 0 | no |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26293 | 54 | self-employed | married | primary | no | 2132 | no | yes | cellular | 20 | nov | 91 | 1 | -1 | 0 | no |
| 2402 | 23 | admin. | single | secondary | no | 281 | yes | no | unknown | 13 | may | 124 | 1 | -1 | 0 | no |
| 6976 | 51 | technician | divorced | secondary | no | 0 | yes | no | unknown | 28 | may | 463 | 2 | -1 | 0 | no |
| 7235 | 51 | management | married | tertiary | no | 0 | yes | yes | unknown | 29 | may | 410 | 5 | -1 | 0 | no |
| 16819 | 39 | blue-collar | married | secondary | no | -66 | no | yes | telephone | 24 | jul | 626 | 5 | -1 | 0 | no |
6141 rows × 16 columns
test.isnull().sum()
age 0 job 0 marital 0 education 0 default 0 balance 0 housing 0 loan 0 contact 0 day 0 month 0 duration 0 campaign 0 pdays 0 previous 0 subscribed 0 dtype: int64
test.isna().sum()
age 0 job 0 marital 0 education 0 default 0 balance 0 housing 0 loan 0 contact 0 day 0 month 0 duration 0 campaign 0 pdays 0 previous 0 subscribed 0 dtype: int64
count_nan_in_test = test.isnull().sum().sum()
print ('Count of NaN: ' + str(count_nan_in_test))
Count of NaN: 0
check_nan_in_test = test.isnull().values.any()
print (check_nan_in_test)
False
for column in test.columns:
print(f"============= {column} =================")
display(test[column].value_counts())
print()
============= age =================
31 309 32 283 34 263 36 258 33 252 35 250 30 240 37 226 39 218 38 213 42 180 46 180 45 177 40 176 29 176 41 167 43 160 47 153 44 148 53 136 49 135 48 132 52 131 51 127 55 126 57 122 28 117 50 117 27 113 56 111 54 106 26 105 59 105 58 103 60 78 25 78 24 48 23 23 22 18 61 13 21 8 62 8 63 7 20 7 65 6 67 6 66 6 64 6 19 5 69 4 68 3 70 2 Name: age, dtype: int64
============= job =================
blue-collar 1426 management 1204 technician 1069 admin. 715 services 554 retired 249 self-employed 216 entrepreneur 196 housemaid 188 unemployed 178 student 110 unknown 36 Name: job, dtype: int64
============= marital =================
married 3781 single 1625 divorced 735 Name: marital, dtype: int64
============= education =================
secondary 3184 tertiary 1731 primary 961 unknown 265 Name: education, dtype: int64
============= default =================
no 6006 yes 135 Name: default, dtype: int64
============= balance =================
0 589
1 30
2 28
3 24
4 21
...
663 1
-296 1
1526 1
645 1
-66 1
Name: balance, Length: 2280, dtype: int64
============= housing =================
yes 3427 no 2714 Name: housing, dtype: int64
============= loan =================
no 5049 yes 1092 Name: loan, dtype: int64
============= contact =================
cellular 3505 unknown 2301 telephone 335 Name: contact, dtype: int64
============= day =================
20 343 8 297 7 287 18 275 21 271 19 264 6 260 5 255 28 252 29 251 17 251 14 244 9 239 30 221 13 214 15 204 12 199 4 199 16 198 11 180 27 178 2 155 26 153 23 149 3 138 22 129 25 103 10 73 31 70 24 62 1 27 Name: day, dtype: int64
============= month =================
may 2066 jul 1077 aug 872 jun 819 nov 392 apr 302 feb 275 jan 189 oct 63 mar 43 sep 32 dec 11 Name: month, dtype: int64
============= duration =================
81 35
88 34
124 32
136 31
126 31
..
527 1
612 1
552 1
568 1
603 1
Name: duration, Length: 625, dtype: int64
============= campaign =================
1 2414 2 1809 3 899 4 547 5 273 6 199 Name: campaign, dtype: int64
============= pdays =================
-1 6141 Name: pdays, dtype: int64
============= previous =================
0 6141 Name: previous, dtype: int64
============= subscribed =================
no 5796 yes 345 Name: subscribed, dtype: int64
From the result code above, we saw that there are also missing value in several columns. Below is the list of all missing value for each column :
Because of poutcome column has been dropped. balance: average yearly balance, if the value is negative because banks charge fees when this happens. And your bank could close your account if it stays negative for too long. pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric, -1 means client was not previously contacted)
Now we know what are the missing values contained in each of the columns. Next, we have to decide how should we act on those columns that have missing values. Should we drop that column? Or should we impute the missing value?
First, we have to see what's the percentage of missing value in that column. If it contained a lot of missing value (let say >35%), then we can just drop that column. If not, then we can do some imputation.
100 * len(test[test['job'].isin(['unknown'])]) / len(test)
0.5862237420615535
100 * len(test[test['education'].isin(['unknown'])]) / len(test)
4.315258101286435
100 * len(test[test['contact'].isin(['unknown'])]) / len(test)
37.469467513434296
for me contact column is important to analyze because of in this dataset, the marketing target way is only using celullar and telephone so we mustn't to drop it since the % missing value near 35%. So, we just drop poutcome column. I hope it's wise choice to imputing another column with unknown value.
test['job'] = test['job'].apply(lambda x: mode_job if x in ['unknown'] else x)
test['job'].value_counts()
blue-collar 1462 management 1204 technician 1069 admin. 715 services 554 retired 249 self-employed 216 entrepreneur 196 housemaid 188 unemployed 178 student 110 Name: job, dtype: int64
test['education'] = test['education'].apply(lambda x: mode_education if x in ['unknown'] else x)
test['education'].value_counts()
secondary 3449 tertiary 1731 primary 961 Name: education, dtype: int64
test['contact'] = test['contact'].apply(lambda x: mode_contact if x in ['unknown'] else x)
test['contact'].value_counts()
cellular 5806 telephone 335 Name: contact, dtype: int64
for column in test.columns:
print(f"============= {column} =================")
display(test[column].value_counts())
print()
============= age =================
31 309 32 283 34 263 36 258 33 252 35 250 30 240 37 226 39 218 38 213 42 180 46 180 45 177 40 176 29 176 41 167 43 160 47 153 44 148 53 136 49 135 48 132 52 131 51 127 55 126 57 122 28 117 50 117 27 113 56 111 54 106 26 105 59 105 58 103 60 78 25 78 24 48 23 23 22 18 61 13 21 8 62 8 63 7 20 7 65 6 67 6 66 6 64 6 19 5 69 4 68 3 70 2 Name: age, dtype: int64
============= job =================
blue-collar 1462 management 1204 technician 1069 admin. 715 services 554 retired 249 self-employed 216 entrepreneur 196 housemaid 188 unemployed 178 student 110 Name: job, dtype: int64
============= marital =================
married 3781 single 1625 divorced 735 Name: marital, dtype: int64
============= education =================
secondary 3449 tertiary 1731 primary 961 Name: education, dtype: int64
============= default =================
no 6006 yes 135 Name: default, dtype: int64
============= balance =================
0 589
1 30
2 28
3 24
4 21
...
663 1
-296 1
1526 1
645 1
-66 1
Name: balance, Length: 2280, dtype: int64
============= housing =================
yes 3427 no 2714 Name: housing, dtype: int64
============= loan =================
no 5049 yes 1092 Name: loan, dtype: int64
============= contact =================
cellular 5806 telephone 335 Name: contact, dtype: int64
============= day =================
20 343 8 297 7 287 18 275 21 271 19 264 6 260 5 255 28 252 29 251 17 251 14 244 9 239 30 221 13 214 15 204 12 199 4 199 16 198 11 180 27 178 2 155 26 153 23 149 3 138 22 129 25 103 10 73 31 70 24 62 1 27 Name: day, dtype: int64
============= month =================
may 2066 jul 1077 aug 872 jun 819 nov 392 apr 302 feb 275 jan 189 oct 63 mar 43 sep 32 dec 11 Name: month, dtype: int64
============= duration =================
81 35
88 34
124 32
136 31
126 31
..
527 1
612 1
552 1
568 1
603 1
Name: duration, Length: 625, dtype: int64
============= campaign =================
1 2414 2 1809 3 899 4 547 5 273 6 199 Name: campaign, dtype: int64
============= pdays =================
-1 6141 Name: pdays, dtype: int64
============= previous =================
0 6141 Name: previous, dtype: int64
============= subscribed =================
no 5796 yes 345 Name: subscribed, dtype: int64
test
| age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | subscribed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8165 | 24 | blue-collar | single | primary | no | 147 | yes | no | cellular | 2 | jun | 102 | 2 | -1 | 0 | no |
| 21927 | 32 | technician | married | secondary | no | 20 | no | no | cellular | 20 | aug | 210 | 2 | -1 | 0 | no |
| 14416 | 47 | technician | married | secondary | no | 0 | no | no | telephone | 14 | jul | 81 | 2 | -1 | 0 | no |
| 35457 | 48 | management | divorced | secondary | no | 122 | yes | no | telephone | 7 | may | 325 | 1 | -1 | 0 | no |
| 30863 | 52 | management | married | tertiary | no | 749 | no | yes | cellular | 9 | feb | 342 | 2 | -1 | 0 | no |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26293 | 54 | self-employed | married | primary | no | 2132 | no | yes | cellular | 20 | nov | 91 | 1 | -1 | 0 | no |
| 2402 | 23 | admin. | single | secondary | no | 281 | yes | no | cellular | 13 | may | 124 | 1 | -1 | 0 | no |
| 6976 | 51 | technician | divorced | secondary | no | 0 | yes | no | cellular | 28 | may | 463 | 2 | -1 | 0 | no |
| 7235 | 51 | management | married | tertiary | no | 0 | yes | yes | cellular | 29 | may | 410 | 5 | -1 | 0 | no |
| 16819 | 39 | blue-collar | married | secondary | no | -66 | no | yes | telephone | 24 | jul | 626 | 5 | -1 | 0 | no |
6141 rows × 16 columns
train
| age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | subscribed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 823 | 33 | blue-collar | single | primary | no | 95 | yes | no | cellular | 7 | may | 102 | 1 | -1 | 0 | no |
| 31304 | 19 | student | single | secondary | no | 1803 | no | no | cellular | 10 | mar | 59 | 1 | -1 | 0 | no |
| 15594 | 25 | blue-collar | married | secondary | no | 895 | no | no | cellular | 21 | jul | 317 | 3 | -1 | 0 | no |
| 14670 | 45 | management | married | tertiary | no | 0 | no | yes | cellular | 15 | jul | 112 | 4 | -1 | 0 | no |
| 26345 | 32 | blue-collar | divorced | secondary | no | 658 | no | no | cellular | 20 | nov | 173 | 2 | -1 | 0 | no |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7751 | 36 | blue-collar | married | secondary | no | 740 | yes | no | cellular | 30 | may | 171 | 2 | -1 | 0 | no |
| 20609 | 30 | management | single | tertiary | no | 67 | no | no | cellular | 12 | aug | 578 | 4 | -1 | 0 | no |
| 7813 | 36 | management | single | tertiary | no | 2806 | yes | no | cellular | 30 | may | 358 | 4 | -1 | 0 | no |
| 43723 | 50 | technician | married | tertiary | no | 1830 | no | no | cellular | 14 | may | 318 | 3 | -1 | 0 | no |
| 32511 | 34 | management | divorced | tertiary | no | 528 | yes | no | cellular | 17 | apr | 233 | 2 | -1 | 0 | no |
22822 rows × 16 columns
#train dataset
Xtrain = train.iloc[:,:-1] #drop the last column
ytrain = train.iloc[:,-1] #choose the last column
#test dataset
Xtest = test.iloc[:,:-1] #drop the last column
ytest = test.iloc[:,-1] #choose the last column
print('There are {} samples in the training set and {} samples in the test set.'.format(
Xtrain.shape[0] , Xtest.shape[0]))
print()
There are 22822 samples in the training set and 6141 samples in the test set.
Xtrain
| age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 823 | 33 | blue-collar | single | primary | no | 95 | yes | no | cellular | 7 | may | 102 | 1 | -1 | 0 |
| 31304 | 19 | student | single | secondary | no | 1803 | no | no | cellular | 10 | mar | 59 | 1 | -1 | 0 |
| 15594 | 25 | blue-collar | married | secondary | no | 895 | no | no | cellular | 21 | jul | 317 | 3 | -1 | 0 |
| 14670 | 45 | management | married | tertiary | no | 0 | no | yes | cellular | 15 | jul | 112 | 4 | -1 | 0 |
| 26345 | 32 | blue-collar | divorced | secondary | no | 658 | no | no | cellular | 20 | nov | 173 | 2 | -1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7751 | 36 | blue-collar | married | secondary | no | 740 | yes | no | cellular | 30 | may | 171 | 2 | -1 | 0 |
| 20609 | 30 | management | single | tertiary | no | 67 | no | no | cellular | 12 | aug | 578 | 4 | -1 | 0 |
| 7813 | 36 | management | single | tertiary | no | 2806 | yes | no | cellular | 30 | may | 358 | 4 | -1 | 0 |
| 43723 | 50 | technician | married | tertiary | no | 1830 | no | no | cellular | 14 | may | 318 | 3 | -1 | 0 |
| 32511 | 34 | management | divorced | tertiary | no | 528 | yes | no | cellular | 17 | apr | 233 | 2 | -1 | 0 |
22822 rows × 15 columns
ytrain
823 no
31304 no
15594 no
14670 no
26345 no
..
7751 no
20609 no
7813 no
43723 no
32511 no
Name: subscribed, Length: 22822, dtype: object
Xtest
| age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8165 | 24 | blue-collar | single | primary | no | 147 | yes | no | cellular | 2 | jun | 102 | 2 | -1 | 0 |
| 21927 | 32 | technician | married | secondary | no | 20 | no | no | cellular | 20 | aug | 210 | 2 | -1 | 0 |
| 14416 | 47 | technician | married | secondary | no | 0 | no | no | telephone | 14 | jul | 81 | 2 | -1 | 0 |
| 35457 | 48 | management | divorced | secondary | no | 122 | yes | no | telephone | 7 | may | 325 | 1 | -1 | 0 |
| 30863 | 52 | management | married | tertiary | no | 749 | no | yes | cellular | 9 | feb | 342 | 2 | -1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26293 | 54 | self-employed | married | primary | no | 2132 | no | yes | cellular | 20 | nov | 91 | 1 | -1 | 0 |
| 2402 | 23 | admin. | single | secondary | no | 281 | yes | no | cellular | 13 | may | 124 | 1 | -1 | 0 |
| 6976 | 51 | technician | divorced | secondary | no | 0 | yes | no | cellular | 28 | may | 463 | 2 | -1 | 0 |
| 7235 | 51 | management | married | tertiary | no | 0 | yes | yes | cellular | 29 | may | 410 | 5 | -1 | 0 |
| 16819 | 39 | blue-collar | married | secondary | no | -66 | no | yes | telephone | 24 | jul | 626 | 5 | -1 | 0 |
6141 rows × 15 columns
ytest
8165 no
21927 no
14416 no
35457 no
30863 no
..
26293 no
2402 no
6976 no
7235 no
16819 no
Name: subscribed, Length: 6141, dtype: object
#Categorical Data
train_house = pd.crosstab(Xtrain.housing,ytrain)
train_loan = pd.crosstab(Xtrain.loan,ytrain)
train_contact = pd.crosstab(Xtrain.contact,ytrain)
train_month = pd.crosstab(Xtrain.month,ytrain)
train_job = pd.crosstab(Xtrain.job,ytrain)
train_default = pd.crosstab(Xtrain.default,ytrain)
train_marital = pd.crosstab(Xtrain.marital,ytrain)
train_edcucation = pd.crosstab(Xtrain.education,ytrain)
#Numerical Data
train_age = pd.crosstab(Xtrain.age,ytrain)
train_balance = pd.crosstab(Xtrain.balance,ytrain)
train_duration = pd.crosstab(Xtrain.duration,ytrain)
train_day = pd.crosstab(Xtrain.day,ytrain)
train_campaign = pd.crosstab(Xtrain.campaign,ytrain)
train_pdays = pd.crosstab(Xtrain.pdays,ytrain)
train_previous = pd.crosstab(Xtrain.previous,ytrain)
#returns four values, 𝜒2 value, p-value, degree of freedom and expected values.
data_column = [train_age,train_balance,train_duration,train_day,train_campaign,train_pdays,train_previous,train_job,train_marital,train_edcucation,train_default,train_contact,train_house,train_loan,train_month]
print("P values of every column")
n=1
for x in data_column:
chi, pval, dof, exp = chi2_contingency(x)
significance = 0.05
print(n,'. -------------------------------',x.index.name,'---------------------------------')
print('p-value=%.6f, significance=%.2f\n' % (pval, significance))
if pval < significance:
print("""At %.2f level of significance, we reject the null hypotheses and accept H1.
They are not independent.""" % (significance))
else:
print("""At %.2f level of significance, we accept the null hypotheses.
They are independent.""" % (significance))
#print(x.index.name," = " ,chi2_contingency(x)[1]) # print p values
print(' --------------------------------------------------------------------------------\n\n')
n+=1
P values of every column 1 . ------------------------------- age --------------------------------- p-value=0.000000, significance=0.05 At 0.05 level of significance, we reject the null hypotheses and accept H1. They are not independent. -------------------------------------------------------------------------------- 2 . ------------------------------- balance --------------------------------- p-value=0.000000, significance=0.05 At 0.05 level of significance, we reject the null hypotheses and accept H1. They are not independent. -------------------------------------------------------------------------------- 3 . ------------------------------- duration --------------------------------- p-value=0.000000, significance=0.05 At 0.05 level of significance, we reject the null hypotheses and accept H1. They are not independent. -------------------------------------------------------------------------------- 4 . ------------------------------- day --------------------------------- p-value=0.000000, significance=0.05 At 0.05 level of significance, we reject the null hypotheses and accept H1. They are not independent. -------------------------------------------------------------------------------- 5 . ------------------------------- campaign --------------------------------- p-value=0.000000, significance=0.05 At 0.05 level of significance, we reject the null hypotheses and accept H1. They are not independent. -------------------------------------------------------------------------------- 6 . ------------------------------- pdays --------------------------------- p-value=1.000000, significance=0.05 At 0.05 level of significance, we accept the null hypotheses. They are independent. -------------------------------------------------------------------------------- 7 . ------------------------------- previous --------------------------------- p-value=1.000000, significance=0.05 At 0.05 level of significance, we accept the null hypotheses. They are independent. -------------------------------------------------------------------------------- 8 . ------------------------------- job --------------------------------- p-value=0.000000, significance=0.05 At 0.05 level of significance, we reject the null hypotheses and accept H1. They are not independent. -------------------------------------------------------------------------------- 9 . ------------------------------- marital --------------------------------- p-value=0.000000, significance=0.05 At 0.05 level of significance, we reject the null hypotheses and accept H1. They are not independent. -------------------------------------------------------------------------------- 10 . ------------------------------- education --------------------------------- p-value=0.000000, significance=0.05 At 0.05 level of significance, we reject the null hypotheses and accept H1. They are not independent. -------------------------------------------------------------------------------- 11 . ------------------------------- default --------------------------------- p-value=0.055944, significance=0.05 At 0.05 level of significance, we accept the null hypotheses. They are independent. -------------------------------------------------------------------------------- 12 . ------------------------------- contact --------------------------------- p-value=0.001030, significance=0.05 At 0.05 level of significance, we reject the null hypotheses and accept H1. They are not independent. -------------------------------------------------------------------------------- 13 . ------------------------------- housing --------------------------------- p-value=0.000000, significance=0.05 At 0.05 level of significance, we reject the null hypotheses and accept H1. They are not independent. -------------------------------------------------------------------------------- 14 . ------------------------------- loan --------------------------------- p-value=0.000000, significance=0.05 At 0.05 level of significance, we reject the null hypotheses and accept H1. They are not independent. -------------------------------------------------------------------------------- 15 . ------------------------------- month --------------------------------- p-value=0.000000, significance=0.05 At 0.05 level of significance, we reject the null hypotheses and accept H1. They are not independent. --------------------------------------------------------------------------------
The Purpose of Chi-Square Test is carried out to investigate the dependency between the both categorical x and y. Based on the results below, the attribute job, marital, education, defualt, housing, loan, contact, month, poutcome, age, balance, duration, day, campaign, pdays, and previous are not independent to subscribed, we reject the null hypotheses and accept H1.
The sklearn LabelEncoder function is utilized to encode the categorical variables such as job, marital, education, contact, poutcome, default, housing, loan, subscribed and month.
def column_encoding(df_x , df_y):
df = pd.concat([df_x,df_y],axis=1)
label_encoder = preprocessing.LabelEncoder()
nominal_cols = ['job', 'marital','education' , 'contact']
for name in nominal_cols:
df[name] = label_encoder.fit_transform(df[name])
df[name].value_counts()
#encoding 'default' , 'housing', 'loan' attributes
# 1 is yes , 0 is no
mapping_dictionary = {"default" :{"yes" : 1 , "no" : 0},
"housing" :{"yes" : 1 , "no" : 0},
"loan" :{"yes" : 1 , "no" : 0} ,
"subscribed" : {"yes" : 1 , "no" : 0}}
df = df.replace(mapping_dictionary)
#month
replace_dictionary = { "month" : {"jan" : 1 ,
"feb" : 2,
"mar" : 3,
"apr" : 4,
"may" : 5,
"jun" : 6,
"jul" : 7,
"aug" : 8,
"sep" : 9,
"oct" : 10,
"nov" : 11,
"dec" : 12}}
df.replace(replace_dictionary , inplace=True)
df_y = df.subscribed
df_x = df.drop('subscribed', axis=1)
return df_x , df_y
Xtrain , ytrain = column_encoding(Xtrain , ytrain)
Xtest , ytest = column_encoding(Xtest , ytest)
After performing label encoding operation for both train and test datasets, the categorical values are converted to numerical values.
Xtrain.head()
| age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 823 | 33 | 1 | 2 | 0 | 0 | 95 | 1 | 0 | 0 | 7 | 5 | 102 | 1 | -1 | 0 |
| 31304 | 19 | 8 | 2 | 1 | 0 | 1803 | 0 | 0 | 0 | 10 | 3 | 59 | 1 | -1 | 0 |
| 15594 | 25 | 1 | 1 | 1 | 0 | 895 | 0 | 0 | 0 | 21 | 7 | 317 | 3 | -1 | 0 |
| 14670 | 45 | 4 | 1 | 2 | 0 | 0 | 0 | 1 | 0 | 15 | 7 | 112 | 4 | -1 | 0 |
| 26345 | 32 | 1 | 0 | 1 | 0 | 658 | 0 | 0 | 0 | 20 | 11 | 173 | 2 | -1 | 0 |
Xtest.head()
| age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8165 | 24 | 1 | 2 | 0 | 0 | 147 | 1 | 0 | 0 | 2 | 6 | 102 | 2 | -1 | 0 |
| 21927 | 32 | 9 | 1 | 1 | 0 | 20 | 0 | 0 | 0 | 20 | 8 | 210 | 2 | -1 | 0 |
| 14416 | 47 | 9 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 14 | 7 | 81 | 2 | -1 | 0 |
| 35457 | 48 | 4 | 0 | 1 | 0 | 122 | 1 | 0 | 1 | 7 | 5 | 325 | 1 | -1 | 0 |
| 30863 | 52 | 4 | 1 | 2 | 0 | 749 | 0 | 1 | 0 | 9 | 2 | 342 | 2 | -1 | 0 |
ytrain
823 0
31304 0
15594 0
14670 0
26345 0
..
7751 0
20609 0
7813 0
43723 0
32511 0
Name: subscribed, Length: 22822, dtype: int64
ytest
8165 0
21927 0
14416 0
35457 0
30863 0
..
26293 0
2402 0
6976 0
7235 0
16819 0
Name: subscribed, Length: 6141, dtype: int64
ytrain.value_counts()
0 21514 1 1308 Name: subscribed, dtype: int64
ytest.value_counts()
0 5796 1 345 Name: subscribed, dtype: int64
The code below is is used to split the Xtrain into 2 different categories which one consists of categorical data while the other consists of numerical data.
categorical_var_train = Xtrain[['contact','education','default','housing','loan','job','marital',
'month']]
numerical_var_train = Xtrain.drop(['contact','education','default','housing','loan','job','marital',
'month'],axis=1)
categorical_var_test = Xtest[['contact','education','default','housing','loan','job','marital',
'month']]
numerical_var_test = Xtest.drop(['contact','education','default','housing','loan','job','marital',
'month'],axis=1)
Since there are numerical data (continuous) in the table, StandardScaler was used to scales each input variable separately by subtracting the mean and dividing by the standard deviation in order to have a distribution of mean of zero and a standard deviation of one.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_Xtrain = scaler.fit_transform(numerical_var_train)
scaled_Xtest = scaler.transform(numerical_var_test)
#Scalling Xtrain
scaled_Xtrain = pd.DataFrame(scaled_Xtrain)
scaled_Xtrain = scaled_Xtrain.reset_index()
categorical_var_train = categorical_var_train.reset_index()
#Scalling Xtest
scaled_Xtest = pd.DataFrame(scaled_Xtest)
scaled_Xtest = scaled_Xtest.reset_index()
categorical_var_test = categorical_var_test.reset_index()
#Merging Xtrain
Xtrain = pd.concat([scaled_Xtrain,categorical_var_train],axis=1)
Xtrain = Xtrain.drop(['index','index'],axis=1)
#Merging Xtest
Xtest = pd.concat([scaled_Xtest,categorical_var_test],axis=1)
Xtest = Xtest.drop(['index','index'],axis=1)
#Check Xtrain
Xtrain.head(10)
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | contact | education | default | housing | loan | job | marital | month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.740194 | -0.616839 | -1.048754 | -0.738912 | -0.882747 | 0.0 | 0.0 | 0 | 0 | 0 | 1 | 0 | 1 | 2 | 5 |
| 1 | -2.146046 | 1.503115 | -0.690026 | -1.046542 | -0.882747 | 0.0 | 0.0 | 0 | 1 | 0 | 0 | 0 | 8 | 2 | 3 |
| 2 | -1.543538 | 0.376113 | 0.625308 | 0.799238 | 0.603827 | 0.0 | 0.0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 7 |
| 3 | 0.464821 | -0.734752 | -0.092147 | -0.667370 | 1.347115 | 0.0 | 0.0 | 0 | 2 | 0 | 0 | 1 | 4 | 1 | 7 |
| 4 | -0.840612 | 0.081951 | 0.505732 | -0.230965 | -0.139460 | 0.0 | 0.0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 11 |
| 5 | 0.966911 | -0.734752 | -0.809602 | -0.760375 | -0.139460 | 0.0 | 0.0 | 0 | 1 | 0 | 0 | 0 | 4 | 1 | 7 |
| 6 | 0.063150 | 0.774536 | 0.625308 | -1.089468 | 0.603827 | 0.0 | 0.0 | 0 | 2 | 0 | 1 | 0 | 4 | 2 | 5 |
| 7 | -0.037268 | 3.151416 | 0.984036 | 1.400190 | -0.882747 | 0.0 | 0.0 | 0 | 1 | 0 | 1 | 0 | 1 | 1 | 7 |
| 8 | 1.770255 | 2.415390 | 1.821067 | 0.727696 | 2.833689 | 0.0 | 0.0 | 0 | 0 | 0 | 1 | 0 | 5 | 0 | 7 |
| 9 | 0.063150 | -1.104627 | -1.287906 | 1.371573 | -0.882747 | 0.0 | 0.0 | 0 | 2 | 0 | 1 | 0 | 0 | 0 | 5 |
#Check Xtest
Xtest.head(10)
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | contact | education | default | housing | loan | job | marital | month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -1.643956 | -0.552297 | -1.646633 | -0.738912 | -0.139460 | 0.0 | 0.0 | 0 | 0 | 0 | 1 | 0 | 1 | 2 | 6 |
| 1 | -0.840612 | -0.709928 | 0.505732 | 0.033740 | -0.139460 | 0.0 | 0.0 | 0 | 1 | 0 | 0 | 0 | 9 | 1 | 8 |
| 2 | 0.665657 | -0.734752 | -0.211723 | -0.889150 | -0.139460 | 0.0 | 0.0 | 1 | 1 | 0 | 0 | 0 | 9 | 1 | 7 |
| 3 | 0.766075 | -0.583327 | -1.048754 | 0.856472 | -0.882747 | 0.0 | 0.0 | 1 | 1 | 0 | 1 | 0 | 4 | 0 | 5 |
| 4 | 1.167747 | 0.194900 | -0.809602 | 0.978093 | -0.139460 | 0.0 | 0.0 | 0 | 2 | 0 | 0 | 1 | 4 | 1 | 2 |
| 5 | -1.242284 | -0.678899 | 0.027429 | 0.999555 | -0.882747 | 0.0 | 0.0 | 0 | 1 | 0 | 1 | 0 | 7 | 2 | 5 |
| 6 | 1.870673 | 1.192817 | 0.266581 | -0.903459 | -0.139460 | 0.0 | 0.0 | 0 | 2 | 0 | 0 | 1 | 2 | 1 | 11 |
| 7 | -0.840612 | -0.373566 | 1.342763 | -0.610137 | 2.833689 | 0.0 | 0.0 | 0 | 1 | 0 | 1 | 1 | 1 | 0 | 6 |
| 8 | -0.137686 | -0.734752 | -1.168330 | 0.405758 | -0.139460 | 0.0 | 0.0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 5 |
| 9 | 1.268165 | 1.542833 | 0.625308 | -0.152269 | -0.139460 | 0.0 | 0.0 | 0 | 1 | 0 | 1 | 0 | 9 | 1 | 11 |
#PCA
def cumulative_variance_pca(model, X):
model.fit(X)
# Calculate cumulative explained variance across all PCs
cum_exp_var = []
var_exp = 0
for i in pca.explained_variance_ratio_:
var_exp += i
cum_exp_var.append(var_exp)
# Plot cumulative explained variance for all PCs
fig, ax = plt.subplots(figsize=(8,6))
ax.bar(range(1,4), cum_exp_var)
ax.set_xlabel('# Principal Components')
ax.set_ylabel('% Cumulative Variance Explained')
return(model)
# Principal Component Analysis
from sklearn.decomposition import PCA
# Setup the PCA with PC = 3
pca = PCA(n_components=3)
# Train the model
pca_trained = cumulative_variance_pca(pca,Xtrain)
# Create dataframe consist of PC and Y
X_pca = pca_trained.transform(Xtrain)
# Combine everything
df_PCA = pd.DataFrame(X_pca,columns=["PC1","PC2","PC3"])
df_PCA["subscribed"] = ytrain
df_PCA.tail()
| PC1 | PC2 | PC3 | subscribed | |
|---|---|---|---|---|
| 22817 | -3.308359 | -0.874095 | 0.630128 | 0.0 |
| 22818 | -0.080381 | 1.796774 | -1.540835 | NaN |
| 22819 | -0.259435 | -0.971939 | 0.815511 | 0.0 |
| 22820 | 4.695387 | -1.252719 | 0.617694 | 0.0 |
| 22821 | -0.361213 | -2.106864 | -0.116003 | 0.0 |
#PCA Visualization
# 2 Dimensional PCA Plot
plt.figure(figsize=(20, 10))
sns.scatterplot(df_PCA["PC1"], df_PCA["PC2"], hue=df_PCA["subscribed"], s=200)
plt.show()
C:\Users\ASUS\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
# 3D Plot Visualization (PC 1, PC 2, and PC 3)
import plotly.express as px
fig = px.scatter_3d(df_PCA, x='PC1', y='PC2', z='PC3',color='subscribed')
fig.show()
For Logistic Regression, a logistic regression classifier is implemented. Hyperparameters are tuned using GridSearchCV and model is then fit to training data.
# Instantiate classifier
logistic_regression = LogisticRegression()
# Set up hyperparameter grid for tuning
logistic_regression_param_grid = {'C' : [0.0001, 0.001, 0.01, 0.05, 0.1] }
# Tune hyperparameters
logistic_regression_model = GridSearchCV(logistic_regression, param_grid = logistic_regression_param_grid, cv = 5)
# Fit model to training data
logistic_regression_model.fit(Xtrain, ytrain)
GridSearchCV(cv=5, estimator=LogisticRegression(),
param_grid={'C': [0.0001, 0.001, 0.01, 0.05, 0.1]})
Predict the test set results and calculate the accuracy.
# Predict test data on logistic regression
print(f"Test Accuracy : {logistic_regression_model.score(Xtest, ytest)*100} %" )
print(f"Train Accuracy : {logistic_regression_model.score(Xtrain, ytrain)*100} %" )
# Obtain model performance metrics
lr_pred_prob = logistic_regression_model.predict_proba(Xtest)[:,1]
lr_auroc = roc_auc_score(ytest, lr_pred_prob)
Test Accuracy : 94.33317049340498 % Train Accuracy : 94.28621505564806 %
KNN is an algorithm which it is non-parametric and lazy (instance based) because it doesn't have a specialized training phase. In this section, the grid search algorithm was used to find the best parameters for the k values in order to have the best accuracy. In this case, k value of 14 provides the highest accuracy score.
# finding the K value
error = []
for i in range(1,31,2):
knn = KNeighborsClassifier(n_neighbors=i)
knn.fit(Xtrain,ytrain)
preds = knn.predict(Xtest)
error.append(np.mean(preds!=ytest))
plt.plot(range(1,31,2), error, linestyle = 'dashed', marker ='o', mfc= 'red')
[<matplotlib.lines.Line2D at 0x24f9f359f70>]
error_rate = []
for i in range(1,50):
knn = KNeighborsClassifier(n_neighbors=i)
knn.fit(Xtrain, ytrain)
pred = knn.predict(Xtest)
error_rate.append(np.mean(pred != ytest))
plt.figure(figsize=(15,10))
plt.plot(range(1,50),error_rate, marker='o', markersize=9)
[<matplotlib.lines.Line2D at 0x24f9aba4370>]
#finding K value using sqrt(n)
#import math
#math.sqrt(len(ytest))
#Fit the training sets to the model.
knn = KNeighborsClassifier(n_neighbors = 5, p=2, metric='minkowski')
knn.fit(Xtrain, ytrain)
KNeighborsClassifier()
#knn_model = knn.predict(Xtest)
#print(f"Accuracy : {accuracy_score(ytest, knn_model)*100} %" )
knn_model = knn.predict(Xtest)
knn_modeltrain = knn.predict(Xtrain)
#knn.score(scaled_Xtest, ytest)
#accuracy_score(ytest, knn_model)
print(f"Test Accuracy : {accuracy_score(ytest, knn_model)*100} %" )
print(f"Train Accuracy : {accuracy_score(ytrain, knn_modeltrain)*100} %" )
Test Accuracy : 94.69141833577594 % Train Accuracy : 95.31154149504863 %
Naive Bayes is a simple "probabilistic classifiers" which based on applying Bayes' theorem with strong (naïve) independence assumptions between the features (X) and it is useful for very large dataset. In this section, GaussionNB was imported from sklearn and Xtrain, ytrain were bring fitted into the model in order to do prediction.
from sklearn.naive_bayes import GaussianNB # 1. choose model class
naive_model = GaussianNB() # 2. instantiate model
Fit the training sets to the model.
naive_model.fit(Xtrain, ytrain) # 3. fit model to data
GaussianNB()
y_naive_model_model = naive_model.predict(Xtest)
y_naive_model_model_train = naive_model.predict(Xtrain)
from sklearn.metrics import accuracy_score
#accuracy_score(ytest, y_naive_model_model)
print(f"Test Accuracy : {accuracy_score(ytest, y_naive_model_model)*100} %" )
print(f"Train Accuracy : {accuracy_score(ytrain, y_naive_model_model_train)*100} %" )
Test Accuracy : 91.38576779026218 % Train Accuracy : 92.11287354307247 %
A linear SVM was chosen by using the SVC classifier to make prediction on Xtest dataset by fitting the Xtrain and ytrain datasets into the SVM model. Accuracy was calculated and displyed by using score syntax.
# Import necessary library
from sklearn.svm import SVC
# Create a linear SVM classifier
svm_model = SVC(kernel='linear', probability=True)
# Train classfier
svm_model.fit(Xtrain, ytrain)
# Take the model that was trained on the Xtrain data and apply it to the Xtest
y_pred_svm = svm_model.predict(Xtest)
# Calculation of accuracy Data Test
svmScore = svm_model.score(Xtest, ytest)
svmScoreTrain = svm_model.score(Xtrain,ytrain)
# Calculation of confusion matrix
svmMatrix = confusion_matrix(ytest, y_pred_svm)
# Print output
print(f"Test Accuracy : {svmScore*100} %")
print(f"Train Accuracy : {svmScoreTrain*100} %")
Test Accuracy : 94.3820224719101 % Train Accuracy : 94.26868810796599 %
The evaluation mathod used involving confusion metrix, precision-recall curve and also learning curve.
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
"""
Generate 3 plots: the test and training learning curve, the training
samples vs fit times curve, the fit times vs score curve.
Parameters
----------
estimator : object type that implements the "fit" and "predict" methods
An object of that type which is cloned for each validation.
title : string
Title for the chart.
X : array-like, shape (n_samples, n_features)
Training vector, where n_samples is the number of samples and
n_features is the number of features.
y : array-like, shape (n_samples) or (n_samples, n_features), optional
Target relative to X for classification or regression;
None for unsupervised learning.
axes : array of 3 axes, optional (default=None)
Axes to use for plotting the curves.
ylim : tuple, shape (ymin, ymax), optional
Defines minimum and maximum yvalues plotted.
cv : int, cross-validation generator or an iterable, optional
Determines the cross-validation splitting strategy.
Possible inputs for cv are:
- None, to use the default 5-fold cross-validation,
- integer, to specify the number of folds.
- :term:`CV splitter`,
- An iterable yielding (train, test) splits as arrays of indices.
For integer/None inputs, if ``y`` is binary or multiclass,
:class:`StratifiedKFold` used. If the estimator is not a classifier
or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.
Refer :ref:`User Guide <cross_validation>` for the various
cross-validators that can be used here.
n_jobs : int or None, optional (default=None)
Number of jobs to run in parallel.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
train_sizes : array-like, shape (n_ticks,), dtype float or int
Relative or absolute numbers of training examples that will be used to
generate the learning curve. If the dtype is float, it is regarded as a
fraction of the maximum size of the training set (that is determined
by the selected validation method), i.e. it has to be within (0, 1].
Otherwise it is interpreted as absolute sizes of the training sets.
Note that for classification the number of samples usually have to
be big enough to contain at least one sample from each class.
(default: np.linspace(0.1, 1.0, 5))
"""
if axes is None:
_, axes = plt.subplots(1, 3, figsize=(20, 5))
axes[0].set_title(title)
if ylim is not None:
axes[0].set_ylim(*ylim)
axes[0].set_xlabel("Training examples")
axes[0].set_ylabel("Score")
train_sizes, train_scores, test_scores, fit_times, _ = \
learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
train_sizes=train_sizes,
return_times=True)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
fit_times_mean = np.mean(fit_times, axis=1)
fit_times_std = np.std(fit_times, axis=1)
# Plot learning curve
axes[0].grid()
axes[0].fill_between(train_sizes, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.1,
color="r")
axes[0].fill_between(train_sizes, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.1,
color="g")
axes[0].plot(train_sizes, train_scores_mean, 'o-', color="r",
label="Training score")
axes[0].plot(train_sizes, test_scores_mean, 'o-', color="g",
label="Cross-validation score")
axes[0].legend(loc="best")
# Plot n_samples vs fit_times
axes[1].grid()
axes[1].plot(train_sizes, fit_times_mean, 'o-')
axes[1].fill_between(train_sizes, fit_times_mean - fit_times_std,
fit_times_mean + fit_times_std, alpha=0.1)
axes[1].set_xlabel("Training examples")
axes[1].set_ylabel("fit_times")
axes[1].set_title("Scalability of the model")
# Plot fit_time vs score
axes[2].grid()
axes[2].plot(fit_times_mean, test_scores_mean, 'o-')
axes[2].fill_between(fit_times_mean, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.1)
axes[2].set_xlabel("fit_times")
axes[2].set_ylabel("Score")
axes[2].set_title("Performance of the model")
return plt
title = "Learning Curves (Logistic Regression)"
# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% data randomly selected as a validation set.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
estimator = LogisticRegression(random_state = 30)
plot_learning_curve(logistic_regression_model, title, Xtrain, ytrain, ylim=(0.7, 1.01),
cv=cv, n_jobs=4)
plt.show()
title = "Learning Curves (KNN)"
# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% data randomly selected as a validation set.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
estimator = KNeighborsClassifier()
plot_learning_curve(knn, title, Xtrain, ytrain,ylim=(0.7, 1.01),
cv=cv, n_jobs=4)
plt.show()
title = "Learning Curves (Naive Bayes)"
# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% data randomly selected as a validation set.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
estimator = GaussianNB()
plot_learning_curve(naive_model, title, Xtrain, ytrain,ylim=(0.7, 1.01),
cv=cv, n_jobs=4)
plt.show()
title = "Learning Curves (Linear SVM)"
# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% data randomly selected as a validation set.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
estimator = SVC()
plot_learning_curve(svm_model, title, Xtrain, ytrain, ylim=(0.7, 1.01),
cv=cv, n_jobs=4)
plt.show()
def res(y_test_valid,y_train_valid):
cm_log = confusion_matrix(ytest,y_test_valid)
ConfusionMatrixDisplay(cm_log).plot()
print(classification_report(ytest,y_test_valid))
print("======================Data Train Result Evaluation========================")
print('train_accuracy:',accuracy_score(ytrain,y_train_valid))
print('F1 score:', f1_score(ytrain,y_train_valid))
print('Recall:', recall_score(ytrain,y_train_valid))
print ('Precision:', precision_score(ytrain,y_train_valid))
print("======================Data Test Result Evaluation========================")
print('test_accuracy:',accuracy_score(ytest,y_test_valid))
print('F1 score:', f1_score(ytest, y_test_valid))
print('Recall:', recall_score(ytest,y_test_valid))
print ('Precision:', precision_score(ytest,y_test_valid))
#print ('Precision:', precision_score(ytest, model,pos_label=1))
#print ('Accuracy:', accuracy_score(ytest, model))
#print ('F1 score:', f1_score(ytest, model,pos_label=1))
#print ('Recall:', recall_score(ytest, model,pos_label=1))
#print ('\n clasification report:\n', classification_report(ytest,model,target_names=target_names))
def model_evaluation(model,name):
confusion_matrix = pd.crosstab(ytest, model, rownames=['Actual'], colnames=['Predicted'], margins = True)
sns.heatmap(confusion_matrix, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('Prediction label')
plt.ylabel('True Label');
plt.title(name)
plt.yticks([0.5,1.5], [ 'NO', 'YES'],va='center')
plt.xticks([0.5,1.5], [ 'NO', 'YES'],va='center')
plt.show()
target_names = ['No' , 'Yes']
print ('Precision:', precision_score(ytest, model,pos_label=1))
print ('Accuracy:', accuracy_score(ytest, model))
print ('F1 score:', f1_score(ytest, model,pos_label=1))
print ('Recall:', recall_score(ytest, model,pos_label=1))
print ('\n clasification report:\n', classification_report(ytest,model,target_names=target_names))
#log_model= LogisticRegression()
#log_model.fit(Xtrain,ytrain)
#y_pred_log = log_model.predict(Xtest)
#y_pred_train = log_model.predict(Xtrain)
#res(y_pred_log,y_pred_train)
lr_y_pred = logistic_regression_model.predict(Xtest)
model_evaluation(lr_y_pred,"Logistic Regression")
Precision: 0.42105263157894735
Accuracy: 0.9433317049340498
F1 score: 0.04395604395604395
Recall: 0.02318840579710145
clasification report:
precision recall f1-score support
No 0.94 1.00 0.97 5796
Yes 0.42 0.02 0.04 345
accuracy 0.94 6141
macro avg 0.68 0.51 0.51 6141
weighted avg 0.92 0.94 0.92 6141
plot_roc_curve(logistic_regression_model,Xtest,ytest)
plt.show()
#knn = KNeighborsClassifier(n_neighbors=77)
#knn_model.fit(Xtrain,ytrain)
#y_pred_knn = knn_model.predict(Xtest)
#y_pred_knn_train = knn_model.predict(Xtrain)
#res(y_pred_knn,y_pred_knn_train)
model_evaluation(knn_model,"KNN")
Precision: 0.6376811594202898
Accuracy: 0.9469141833577593
F1 score: 0.21256038647342995
Recall: 0.12753623188405797
clasification report:
precision recall f1-score support
No 0.95 1.00 0.97 5796
Yes 0.64 0.13 0.21 345
accuracy 0.95 6141
macro avg 0.79 0.56 0.59 6141
weighted avg 0.93 0.95 0.93 6141
plot_roc_curve(knn,Xtest,ytest)
plt.show()
#use penalized learning algorithms that increase the cost of classification mistakes on the minority class.
svm_model = SVC(class_weight='balanced', probability=True)
svm_model.fit(Xtrain,ytrain)
y_pred_svm = svm_model.predict(Xtest)
y_pred_svm_train = svm_model.predict(Xtrain)
res(y_pred_svm,y_pred_svm_train)
precision recall f1-score support
0 0.99 0.77 0.87 5796
1 0.17 0.81 0.29 345
accuracy 0.77 6141
macro avg 0.58 0.79 0.58 6141
weighted avg 0.94 0.77 0.83 6141
======================Data Train Result Evaluation========================
train_accuracy: 0.7802997108053632
F1 score: 0.3108851017042331
Recall: 0.8646788990825688
Precision: 0.1895107238605898
======================Data Test Result Evaluation========================
test_accuracy: 0.7741410193779514
F1 score: 0.28615542974781266
Recall: 0.8057971014492754
Precision: 0.1739674593241552
model_evaluation(y_pred_svm,"Linear SVM")
Precision: 0.1739674593241552
Accuracy: 0.7741410193779514
F1 score: 0.28615542974781266
Recall: 0.8057971014492754
clasification report:
precision recall f1-score support
No 0.99 0.77 0.87 5796
Yes 0.17 0.81 0.29 345
accuracy 0.77 6141
macro avg 0.58 0.79 0.58 6141
weighted avg 0.94 0.77 0.83 6141
plot_roc_curve(svm_model,Xtest,ytest)
plt.show()
model_evaluation(y_naive_model_model,"Naive Bayes")
Precision: 0.21069182389937108
Accuracy: 0.9138576779026217
F1 score: 0.20211161387631976
Recall: 0.19420289855072465
clasification report:
precision recall f1-score support
No 0.95 0.96 0.95 5796
Yes 0.21 0.19 0.20 345
accuracy 0.91 6141
macro avg 0.58 0.58 0.58 6141
weighted avg 0.91 0.91 0.91 6141
plot_roc_curve(naive_model,Xtest,ytest)
plt.show()